In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
In [2]:
# Get the datasets directory located under the py_entitymatching install path
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Get the paths of the input tables; os.path.join inserts the platform
# separator and does not double it when a component already ends with one
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
In [3]:
# Load both CSV files (table A first, then table B), declaring 'ID' as the key
# attribute of each table
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))
In [4]:
# Calling get_features_for_blocking or get_features_for_matching without setting the flag
# validate_inferred_attr_types to False will result in a validation process that shows the
# user a table containing all of the inferred attribute correspondences and inferred types.
# Get features (for blocking)
feature_table = em.get_features_for_blocking(A, B)
# Get features (for matching)
# feature_table = em.get_features_for_matching(A, B)
In [5]:
# Add a feature that applies Jaccard to the wspace-tokenized, lower-cased
# "name address" strings of the left and right tuples, and add it to the feature table
# Create a feature declaratively
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['name'] + ' ' + ltuple['address']).lower()),
wspace((rtuple['name'] + ' ' + rtuple['address']).lower()))"""
# Turn the declarative string into a feature, resolving names via sim and tok
feature = em.get_feature_fn(feature_string, sim, tok)
# Add the feature to the feature table under the name 'jac_ws_name_address'
em.add_feature(feature_table, 'jac_ws_name_address', feature)
Out[5]:
In [6]:
# Display the feature names now held in the feature table
feature_table.feature_name
Out[6]:
In [7]:
import fuzzywuzzy.StringMatcher as fz
In [8]:
# Try the imported ratio function on two strings that differ only in their first character
fz.ratio('xyz', 'ayz')
Out[8]:
In [9]:
# Create a feature declaratively: start by fetching the similarity functions
# available for matching (this rebinds sim)
sim = em.get_sim_funs_for_matching()
In [10]:
# Register fz.ratio under the key 'fz_ratio' so a feature string can refer to it by that name
sim['fz_ratio'] = fz.ratio
In [11]:
# Display sim to check that the 'fz_ratio' entry is present
sim
Out[11]:
In [12]:
# Declare a feature that applies the custom 'fz_ratio' function to the
# lower-cased "name address" strings of the left and right tuples
feature_string = """fz_ratio((ltuple['name'] + ' ' + ltuple['address']).lower(),
(rtuple['name'] + ' ' + rtuple['address']).lower())"""
# Turn the declarative string into a feature, resolving names via sim and tok
feature = em.get_feature_fn(feature_string, sim, tok)
# Add the feature to the feature table under the name 'fzratio_name_address'
em.add_feature(feature_table, 'fzratio_name_address', feature)
Out[12]:
In [13]:
# Display the feature names again, after adding 'fzratio_name_address'
feature_table.feature_name
Out[13]:
In [14]:
import fuzzywuzzy.StringMatcher as fz
In [15]:
def my_feature(ltuple, rtuple):
    """Black-box feature: similarity ratio between the two tuples' names.

    Args:
        ltuple: Left tuple; must support ``ltuple['name']``.
        rtuple: Right tuple; must support ``rtuple['name']``.

    Returns:
        The value of ``fz.ratio`` applied to the two ``'name'`` values.
    """
    # Return a single score rather than the raw (left, right) name pair, so the
    # function matches the 'blackbox_fz_ratio_name' feature it is registered as.
    return fz.ratio(ltuple['name'], rtuple['name'])
In [16]:
# Rebind feature_table to the result of a new get_features_for_blocking call;
# NOTE(review): presumably a fresh table without the features added above — confirm
feature_table = em.get_features_for_blocking(A, B)
In [17]:
# Print the built-in help text for em.add_blackbox_feature
help(em.add_blackbox_feature)
In [18]:
# Add the Python function my_feature to the feature table as a black-box
# feature named 'blackbox_fz_ratio_name'
em.add_blackbox_feature(feature_table, 'blackbox_fz_ratio_name', my_feature)
Out[18]:
In [19]:
# Display the feature names, which should now include 'blackbox_fz_ratio_name'
feature_table.feature_name
Out[19]: